In [1]:
import numpy as np
import sklearn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib
def is_covariance(x):
    # checks if x is a valid covariance matrix (symmetric and positive definite)
    symmetric = np.all(x == x.T)
    pos_def = np.all(np.linalg.eigvals(x) > 0)
    return symmetric and pos_def
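# illustrative sanity check (assumed example, not required below):
# the identity is a valid covariance matrix, an indefinite symmetric matrix is not
assert is_covariance(np.eye(3))
assert not is_covariance(np.array([[1, 2], [2, 1]]))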
def plot_data(x, title, eigen=False):
    # plot 3d data
    # x must be of size (n,3)
    # if eigen=True, also plot eigenvectors of cov(x), with the corresponding eigenvalue
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(x[:, 0], x[:, 1], zs=x[:, 2], alpha=0.1)
    limit_max = np.max(x) + 1
    limit_min = np.min(x) - 1
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_zlabel("z")
    ax.set_xlim(limit_min, limit_max)
    ax.set_ylim(limit_min, limit_max)
    ax.set_zlim(limit_min, limit_max)
    ax.set_title(title)
    if eigen:
        n, features = x.shape
        mu = np.mean(x, axis=0)
        pca = PCA(n_components=features)
        pca.fit(x)
        eigenvectors = pca.components_  # each row is an eigenvector
        eigenvalues = pca.explained_variance_  # eigenvalues
        scale = np.abs(x).max()
        scaled_eigenvectors = eigenvectors * scale / 4
        ax.scatter([mu[0]], [mu[1]], zs=[mu[2]], color="green", s=150)
        endpoints = scaled_eigenvectors + mu
        for i in range(features):
            ax.plot([mu[0], endpoints[i, 0]], [mu[1], endpoints[i, 1]], [mu[2], endpoints[i, 2]],
                    color='red', alpha=0.8, lw=2)
            ax.text(endpoints[i, 0], endpoints[i, 1], endpoints[i, 2],
                    r'$\lambda_{%d}$=%0.2f' % (i, eigenvalues[i]), size=8, zorder=1)
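As a side note, the eigenvectors and eigenvalues that plot_data draws come from PCA; as a minimal sketch (x_demo and pca_demo are illustrative names, not used elsewhere), they should match an explicit eigen-decomposition of the sample covariance matrix up to ordering and sign:
# illustrative check: PCA's components_/explained_variance_ equal the eigenvectors/
# eigenvalues of the sample covariance matrix (with the usual n-1 denominator)
x_demo = np.random.normal(size=(500, 3))
pca_demo = PCA(n_components=3).fit(x_demo)
evals, evecs = np.linalg.eigh(np.cov(x_demo, rowvar=False))
order = np.argsort(evals)[::-1]  # eigh returns eigenvalues in ascending order
print(np.allclose(evals[order], pca_demo.explained_variance_))
print(np.allclose(np.abs(evecs[:, order].T), np.abs(pca_demo.components_)))  # up to sign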
Generate data to simulate a dataset of samples $x$ in which all 3 features/columns could be collected. $x$ has size $n \times features$.
In [2]:
# mean and covariance of the multivariate normal distribution used to generate samples
mu = np.array([5, 0, -2])
σ = np.array([[ 9,  1, -1],
              [ 1,  3, -2],
              [-1, -2,  2]])
if not is_covariance(σ):
    print("Warning: σ is not a valid covariance matrix (not symmetric or positive definite)")
n = 1000  # number of samples
x = np.random.multivariate_normal(mu, σ, size=(n,))
#plot generated data
plt.close("all")
plot_data(x,"data in original space")
plt.show()
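As a quick sanity check (a minimal sketch over the $x$ just generated), the empirical mean and covariance should be close to mu and σ:
# illustrative check: the empirical moments should approximate the generating parameters
print(np.round(np.mean(x, axis=0), 2))       # ≈ mu
print(np.round(np.cov(x, rowvar=False), 2))  # ≈ σ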
Generate another dataset $y$ with the same distribution as $x$ (this is a very strong assumption!)
In [3]:
y=np.random.multivariate_normal(mu,σ,size=(n,))
plot_data(y,"y in original space")
Let's simulate the fact that for $y$ we can't measure all values. In this case, we will create y_missing, which only has 2 features.
In [4]:
y_missing=y[:,0:2]
plt.figure()
plt.scatter(y_missing[:,0],y_missing[:,1])
plt.title("y_missing in original space (2d)")
Now, let's assume that we can recover the last feature of $y$ using a Linear Regression model trained on $x$.
In [5]:
from sklearn import linear_model
model = linear_model.LinearRegression()
# train on the two observed features of x to predict the third
model.fit(x[:, 0:2], x[:, 2])
# predict the missing third feature of y from its two observed features
y3 = model.predict(y_missing)
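For reference, the same fit can be written in closed form as ordinary least squares with an explicit intercept column; a minimal sketch (the names A, coef and y3_manual are illustrative) that should agree with model.predict up to numerical precision:
# illustrative closed-form equivalent of LinearRegression: least squares with intercept
A = np.hstack([x[:, 0:2], np.ones((n, 1))])
coef, *_ = np.linalg.lstsq(A, x[:, 2], rcond=None)
y3_manual = np.hstack([y_missing, np.ones((n, 1))]) @ coef
print(np.allclose(y3_manual, y3))  # should match model.predict(y_missing)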
We can now create a reconstructed version of $y$, with the 2 original columns (y_missing) and the predicted values (y3).
In [6]:
y3=y3.reshape((-1,1))
print(y_missing.shape,y3.shape)
y_reconstructed=np.hstack([y_missing,y3])
plot_data(y_reconstructed, "y_reconstructed")
In [7]:
# columns 0 and 1 are copied exactly, so all of the error comes from the predicted column
mean_reconstruction_error = ((y_reconstructed - y) ** 2).sum() / n
print(mean_reconstruction_error)
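Because the data is multivariate normal, the best possible predictor of the third feature given the first two is linear, and its residual variance is the conditional variance $\sigma_{33} - \Sigma_{3,12}\,\Sigma_{12}^{-1}\,\Sigma_{12,3}$ (blocks of σ). A minimal sketch computing this theoretical floor; the measured error above should be close to it:
# illustrative: theoretical lower bound on the reconstruction error
S12 = σ[0:2, 0:2]   # covariance block of the two observed features
s3 = σ[0:2, 2]      # covariance between the observed features and the missing one
conditional_var = σ[2, 2] - s3 @ np.linalg.inv(S12) @ s3
print(conditional_var)  # mean_reconstruction_error should be close to this value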